This initial setup chunk does two things: it sets all code chunks to be displayed (echo = TRUE) in the final HTML output, and it loads the NBA dataset from a CSV file into a dataframe named mvp_data.
# Load necessary libraries
library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.3.2
## Warning: package 'tidyr' was built under R version 4.3.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.3.2
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(lubridate)
library(zoo)
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(ggplot2)
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
This section loads essential R packages: tidyverse: A collection of data manipulation and visualization packages caret: For machine learning and predictive modeling lubridate: For handling date/time data zoo: For time series analysis ggplot2: For creating static visualizations plotly: For interactive visualizations
# Impute missing values: every NA in a numeric column becomes that
# column's median (computed over the non-missing entries)
mvp_data <- mvp_data %>%
  mutate(across(where(is.numeric), ~ replace(., is.na(.), median(., na.rm = TRUE))))
This replaces missing values in all numeric columns with the median value of that column.
# 3. Create an MVP winner indicator
# Typically, the player with the highest award_share wins MVP.
# Flag, within each season, the row holding that season's maximum
# award_share. NOTE(review): an exact tie would flag several players.
mvp_data <- mvp_data %>%
  mutate(is_mvp = award_share == max(award_share), .by = season)
This creates a new binary column is_mvp that identifies the player who won the MVP award in each season (the player with the highest award_share).
# 4. Feature engineering: composite performance metrics first,
# then team/availability context and position indicators
mvp_data <- mvp_data %>%
  mutate(
    # Box-score volume: points + rebounds + assists per game
    pra = pts_per_g + trb_per_g + ast_per_g,
    # Shooting-efficiency composite (both percentages on a 0-100 scale)
    efficiency = (ts_pct * 100) + efg_pct * 100,
    # Offensive plus defensive box plus/minus
    two_way_score = obpm + dbpm,
    # Weighted blend of VORP and Win Shares
    value_composite = (vorp * 5) + ws
  ) %>%
  mutate(
    # Team win percentage rescaled to 0-100
    team_success = win_loss_pct * 100,
    # Share of a standard 82-game season actually played
    games_played_pct = g / 82,
    # Position dummies: as.numeric(logical) yields the same 1/0 doubles
    is_guard = as.numeric(pos %in% c("PG", "SG", "G")),
    is_forward = as.numeric(pos %in% c("SF", "PF", "F")),
    is_center = as.numeric(pos %in% c("C"))
  )
This creates several new features to better capture player performance:
pra: Combined points, rebounds, and assists per game efficiency: Combined true shooting and effective field goal percentages two_way_score: Combined offensive and defensive box plus/minus value_composite: Weighted combination of VORP and Win Shares team_success: Team’s win percentage (scaled to 0-100) games_played_pct: Percentage of games played in a season Position indicators: Binary variables for player positions
# 5. Normalize features to avoid scale issues
# Fit centering (mean 0) and scaling (sd 1) parameters on every numeric
# column, then apply them to produce mvp_data_scaled.
# NOTE(review): this standardizes *all* numeric columns, including the
# identifier `season`, the target `award_share`, and the 0/1 position
# dummies. That is why any later year-based comparison on `season` must
# be expressed on the standardized scale, and why reported RMSE values
# are in target-standard-deviation units rather than raw award-share
# units. Consider excluding identifier/target columns from preProcess.
preproc_params <- preProcess(mvp_data %>% select(where(is.numeric)), method = c("center", "scale"))
mvp_data_scaled <- predict(preproc_params, mvp_data)
This standardizes all numeric features (centering around mean 0 with standard deviation 1) to ensure all variables are on the same scale for modeling.
# 6. Create time-based features to capture trends
# Season-over-season deltas per player. `order_by = season` makes the
# lag follow chronological order even when the rows of the data frame
# are not sorted that way (the original code silently assumed sorted
# input). A player's first observed season gets a delta of zero.
mvp_data_scaled <- mvp_data_scaled %>%
  group_by(player) %>%
  mutate(
    pts_trend = pts_per_g - lag(pts_per_g, order_by = season,
                                default = first(pts_per_g, order_by = season)),
    ws_trend = ws - lag(ws, order_by = season,
                        default = first(ws, order_by = season))
  ) %>%
  ungroup()
This adds two trend features that capture how a player’s performance changed from the previous season:
pts_trend: Change in points per game ws_trend: Change in win shares
# 7. Split into training and testing datasets
# Use seasons up to 2018 for training and 2019 onward for testing.
# `season` was standardized in step 5, so translate the calendar-year
# cutoff onto the z-score scale instead of hard-coding the magic
# number 1.22 (which was the standardized value of 2018).
season_cutoff <- (2018 - preproc_params$mean[["season"]]) / preproc_params$std[["season"]]
train_data <- mvp_data_scaled %>% filter(season <= season_cutoff)
test_data <- mvp_data_scaled %>% filter(season > season_cutoff)
This splits the data into training (seasons through 2018) and testing (2019 onward) sets. Because the season column was standardized in step 5, the year cutoff must be expressed on that standardized (z-score) scale rather than as a raw calendar year.
library(glmnet) # For LASSO and Ridge regression
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-8
library(xgboost) # For XGBoost
## Warning: package 'xgboost' was built under R version 4.3.3
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:plotly':
##
## slice
## The following object is masked from 'package:dplyr':
##
## slice
library(e1071) # For Support Vector Machines
## Warning: package 'e1071' was built under R version 4.3.3
library(keras) # For neural networks
## Warning: package 'keras' was built under R version 4.3.2
library(rpart) # For decision trees
## Warning: package 'rpart' was built under R version 4.3.3
library(kernlab) # For Gaussian Process
## Warning: package 'kernlab' was built under R version 4.3.3
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:purrr':
##
## cross
## The following object is masked from 'package:ggplot2':
##
## alpha
This loads additional packages needed for various machine learning algorithms.
# Correlation-based feature screening against the target
candidate_cols <- c("award_share", "pts_per_g", "trb_per_g", "ast_per_g",
                    "stl_per_g", "blk_per_g", "ts_pct", "ws", "vorp", "bpm",
                    "win_loss_pct", "pra", "efficiency", "two_way_score",
                    "value_composite", "team_success", "games_played_pct")
correlation_matrix <- cor(train_data %>% select(all_of(candidate_cols)))
# Row 1 holds each candidate's correlation with award_share;
# drop the trivial self-correlation in column 1
award_share_correlations <- correlation_matrix[1, -1]
# Rank by correlation strength (absolute value, strongest first)
ranked_abs_corr <- sort(abs(award_share_correlations), decreasing = TRUE)
correlation_df <- data.frame(
  Variable = names(ranked_abs_corr),
  Correlation = ranked_abs_corr
)
# Keep the ten strongest predictors for modeling
top_features <- head(correlation_df$Variable, 10L)
This performs feature selection by:
Calculating correlations between key variables and award_share Creating a dataframe of these correlations Sorting to find the variables most strongly correlated with MVP award shares Selecting the top 10 most correlated features
# Build the modeling formula award_share ~ <top 10 features>
formula_obj <- reformulate(top_features, response = "award_share")
This creates an R formula object using the top 10 features for model building.
# Shared resampling scheme: quiet 5-fold cross-validation,
# reused by every model fit below
train_control <- trainControl(method = "cv", number = 5, verboseIter = FALSE)
This sets up 5-fold cross-validation for model training.
# Matrix-format inputs for models that cannot take a formula (glmnet).
# Small helper: expand the formula and drop the intercept column that
# model.matrix prepends.
design_matrix <- function(df) model.matrix(formula_obj, df)[, -1]
x_train <- design_matrix(train_data)
y_train <- train_data$award_share
x_test <- design_matrix(test_data)
y_test <- test_data$award_share
This creates matrix versions of the training and testing data, which are required for some algorithms like LASSO and Ridge regression.
# 1. Ordinary least squares — the baseline every other model must beat
lm_model <- train(formula_obj, data = train_data, method = "lm",
                  trControl = train_control)
# 2.-4. Regularized linear models via glmnet.
# LASSO (alpha = 1) can shrink coefficients exactly to zero and so
# doubles as feature selection; Ridge (alpha = 0) only shrinks toward
# zero, guarding against overfitting; Elastic Net searches mixtures
# of the two penalties.
lasso_grid <- expand.grid(alpha = 1, lambda = seq(0.001, 0.1, by = 0.001))
lasso_model <- train(x = x_train, y = y_train, method = "glmnet",
                     trControl = train_control, tuneGrid = lasso_grid)
ridge_grid <- expand.grid(alpha = 0, lambda = seq(0.001, 0.1, by = 0.001))
ridge_model <- train(x = x_train, y = y_train, method = "glmnet",
                     trControl = train_control, tuneGrid = ridge_grid)
enet_grid <- expand.grid(alpha = seq(0, 1, by = 0.2),
                         lambda = seq(0.001, 0.1, by = 0.005))
elastic_net_model <- train(x = x_train, y = y_train, method = "glmnet",
                           trControl = train_control, tuneGrid = enet_grid)
# 5. Gradient Boosting Machine: an ensemble of boosted decision trees
gbm_model <- train(formula_obj, data = train_data, method = "gbm",
                   trControl = train_control, verbose = FALSE)
# 6. Support Vector Regression with a radial-basis kernel,
# tuning over caret's default 5-point grid
svr_model <- train(formula_obj, data = train_data, method = "svmRadial",
                   trControl = train_control, tuneLength = 5)
# 7. K-Nearest Neighbors, tuning k over 1..20
knn_model <- train(formula_obj, data = train_data, method = "knn",
                   trControl = train_control,
                   tuneGrid = expand.grid(k = 1:20))
# Collect the fitted models under descriptive names
models <- list(
  LinearRegression = lm_model,
  LASSO = lasso_model,
  Ridge = ridge_model,
  ElasticNet = elastic_net_model,
  GradientBoosting = gbm_model,
  SupportVectorRegression = svr_model,
  KNN = knn_model
)
# Score the held-out data with every model. The glmnet-based models
# were fit on the design matrix while the rest were fit on the
# formula/data-frame pair, so each needs the matching `newdata` form.
matrix_based <- c("LASSO", "Ridge", "ElasticNet")
predictions <- lapply(names(models), function(nm) {
  if (nm %in% matrix_based) {
    predict(models[[nm]], newdata = x_test)
  } else {
    predict(models[[nm]], newdata = test_data)
  }
})
names(predictions) <- names(models)
This generates predictions for each model on the test data, handling the different input formats required by different models.
# Root-mean-squared error between a prediction vector and the
# observed values (lower is better)
calculate_rmse <- function(pred, actual) {
  sqrt(mean((pred - actual)^2))
}
# vapply (rather than sapply) guarantees a named numeric vector even
# for degenerate input, while producing identical results here
model_rmse <- vapply(predictions, calculate_rmse, numeric(1), actual = y_test)
print(model_rmse)
## LinearRegression LASSO Ridge
## 0.7921366 0.7925482 0.8000463
## ElasticNet GradientBoosting SupportVectorRegression
## 0.7925482 0.4952495 0.6320104
## KNN
## 0.4938155
This calculates the Root Mean Square Error (RMSE) for each model and prints the results.
# Pick the model with the smallest held-out RMSE
best_idx <- which.min(model_rmse)
best_model_name <- names(model_rmse)[best_idx]
best_model <- models[[best_model_name]]
cat("Best model:", best_model_name, "with RMSE:", model_rmse[[best_idx]], "\n")
## Best model: KNN with RMSE: 0.4938155
This identifies the model with the lowest RMSE (best performance).
# Score the test set with the winning model, passing whichever input
# representation (matrix vs. data frame) it was trained on
newdata_for_best <- if (best_model_name %in% c("LASSO", "Ridge", "ElasticNet")) {
  x_test
} else {
  test_data
}
test_data$predicted_award_share <- predict(best_model, newdata = newdata_for_best)
# Within each test season, the predicted MVP is the player with the
# highest predicted award share
predicted_mvps <- test_data %>%
  group_by(season) %>%
  slice_max(order_by = predicted_award_share, n = 1) %>%
  select(season, player, predicted_award_share, award_share, is_mvp)
This identifies the predicted MVP for each season (player with highest predicted award_share).
# Per-season actual winners (highest observed award_share)
actual_mvps <- test_data %>%
  group_by(season) %>%
  slice_max(order_by = award_share, n = 1) %>%
  select(season, player, award_share)
# Line up predicted and actual winners season by season
comparison <- merge(predicted_mvps, actual_mvps, by = "season", suffixes = c("_predicted", "_actual"))
# Accuracy = share of test seasons where the right winner was named
mvp_accuracy <- mean(comparison$player_predicted == comparison$player_actual)
cat("MVP Prediction Accuracy:", mvp_accuracy * 100, "%\n")
## MVP Prediction Accuracy: 50 %
This calculates the percentage of seasons where the model correctly predicted the MVP winner.
# Extract and analyze feature importance for the best model.
# The extraction method depends on the winning model's class; models
# without a native importance measure fall back to the earlier
# correlation ranking (a proxy, not true model importance).
if (best_model_name == "LinearRegression") {
# For linear regression: rank by absolute coefficient magnitude.
# Coefficients are comparable here because inputs were standardized.
coefficients <- coef(best_model$finalModel)[-1] # Remove intercept
var_importance <- data.frame(
Variable = names(coefficients),
Importance = abs(coefficients)
)
var_importance <- var_importance[order(var_importance$Importance, decreasing = TRUE), ]
} else if (best_model_name %in% c("LASSO", "Ridge", "ElasticNet")) {
# For regularized regression: read coefficients at the tuned lambda
coefficients <- coef(best_model$finalModel, best_model$bestTune$lambda)[, 1][-1] # Remove intercept
var_importance <- data.frame(
Variable = names(coefficients),
Importance = abs(coefficients)
)
var_importance <- var_importance[order(var_importance$Importance, decreasing = TRUE), ]
} else if (best_model_name == "GradientBoosting") {
# For GBM: use its built-in relative-influence measure
var_importance <- summary(best_model$finalModel, plotit = FALSE)
var_importance <- var_importance[order(var_importance$rel.inf, decreasing = TRUE), ]
names(var_importance)[names(var_importance) == "rel.inf"] <- "Importance"
} else {
# Fallback for KNN/SVR/etc.: reuse |correlation| ranking as a proxy
var_importance <- correlation_df
names(var_importance)[names(var_importance) == "Correlation"] <- "Importance"
}
print(var_importance)
## Variable Importance
## vorp vorp 0.46913580
## value_composite value_composite 0.45290418
## ws ws 0.38803353
## pra pra 0.29506484
## pts_per_g pts_per_g 0.29489024
## ast_per_g ast_per_g 0.20408176
## trb_per_g trb_per_g 0.19426497
## bpm bpm 0.19305511
## two_way_score two_way_score 0.19301911
## stl_per_g stl_per_g 0.18957140
## blk_per_g blk_per_g 0.15308848
## team_success team_success 0.13730372
## win_loss_pct win_loss_pct 0.13730372
## games_played_pct games_played_pct 0.09092698
## ts_pct ts_pct 0.08804256
## efficiency efficiency 0.07878740
This extracts the feature importance from the best model, using different methods depending on the model type.
library(ggplot2)
library(plotly)
# Rank models from best (lowest RMSE) to worst for display
model_performance <- data.frame(Model = names(model_rmse), RMSE = model_rmse)
model_performance <- model_performance[order(model_performance$RMSE), ]
# Interactive bar chart comparing model RMSE
p1 <- ggplot(model_performance, aes(x = reorder(Model, -RMSE), y = RMSE)) +
  geom_col(fill = "steelblue") +
  theme_minimal() +
  labs(title = "Model Performance Comparison", x = "Model", y = "RMSE (lower is better)") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
ggplotly(p1)
# Horizontal bar chart of the ten most important features
p2 <- ggplot(head(var_importance, 10), aes(x = reorder(Variable, Importance), y = Importance)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  theme_minimal() +
  labs(title = paste("Feature Importance -", best_model_name), x = "", y = "Importance")
ggplotly(p2)
# Scatter of predicted vs. actual award shares; the dashed red line
# marks perfect prediction (y = x)
results <- test_data %>%
  transmute(
    Player = player,
    Season = season,
    Actual = award_share,
    Predicted = predicted_award_share
  )
p3 <- ggplot(results, aes(x = Actual, y = Predicted)) +
  geom_point(alpha = 0.5) +
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
  theme_minimal() +
  labs(title = "Actual vs Predicted MVP Award Shares",
       x = "Actual Award Share",
       y = "Predicted Award Share")
ggplotly(p3)
# Ten highest predicted award shares across all test seasons
top_predictions <- test_data %>%
  arrange(desc(predicted_award_share)) %>%
  slice_head(n = 10)
p5 <- ggplot(top_predictions, aes(x = reorder(player, predicted_award_share), y = predicted_award_share)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  theme_minimal() +
  labs(title = "Top 10 Predicted MVP Candidates", x = "", y = "Predicted Award Share")
ggplotly(p5)
# Final console summary: winning model, its test RMSE, MVP-pick
# accuracy, and the three highest-ranked features
cat("Best model:", best_model_name, "\n")
## Best model: KNN
cat("Model RMSE:", round(min(model_rmse), 4), "\n")
## Model RMSE: 0.4938
cat("MVP Prediction Accuracy:", round(mvp_accuracy * 100, 1), "%", "\n")
## MVP Prediction Accuracy: 50 %
cat("Top 3 most important features:", "\n")
## Top 3 most important features:
cat("1.", var_importance$Variable[1], "\n")
## 1. vorp
cat("2.", var_importance$Variable[2], "\n")
## 2. value_composite
cat("3.", var_importance$Variable[3], "\n")
## 3. ws
This analysis demonstrates that our model can predict NBA MVP candidates with reasonable accuracy. The KNN model performed best with an RMSE of 0.494. We were able to correctly identify the MVP winner in 50% of test seasons. The most important features for predicting MVP status are vorp, value_composite, and ws, indicating that these statistics have the strongest influence on MVP voting patterns.
Next steps: (1) collect more recent data to validate the model on the latest seasons; (2) explore additional features such as media attention metrics; (3) develop an ensemble approach combining multiple top-performing models; (4) create an interactive prediction tool for upcoming seasons.